Import library

library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.21.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)

QUALITY CHECK

First, we import the fastq files containing the raw reads. The samples were downloaded from the SRA database with the accession number PRJEB11419. IBS and healthy samples were selected out of the thousands of samples as described in 00_Metadata-AGP.R.

# Save the path to the directory containing the fastq zipped files
path <- "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP"
# list.files(path) # check we are in the right directory

# fastq filenames have format: SAMPLENAME.fastq.gz
# Saves the whole directory path to each file name
FNs <- sort(list.files(file.path(path, "original"), pattern=".fastq.gz", full.names = TRUE))
show(FNs[1:5])
## [1] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073039_1.fastq.gz"
## [2] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073655_1.fastq.gz"
## [3] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073725_1.fastq.gz"
## [4] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074570_1.fastq.gz"
## [5] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074573_1.fastq.gz"
# Extract sample names, assuming filenames have format: SAMPLENAME.fastq
sample.names <- sapply(strsplit(basename(FNs), "_"), `[`, 1)
show(sample.names[1:5]) # saves only the file name (without the path)
## [1] "ERR1073039" "ERR1073655" "ERR1073725" "ERR1074570" "ERR1074573"
# Look at quality of all files
for (i in 1:4){ # 1:length(FNs)
  show(plotQualityProfile(FNs[i]))
}

# Look at nb of reads per sample
# raw_stats <- data.frame('sample' = sample.names,
#                         'reads' = fastqq(FNs)@nReads)
# min(raw_stats$reads)
# max(raw_stats$reads)
# mean(raw_stats$reads)
# hist(log10(raw_stats$reads))

We will have a quick peak at the per base sequence content of the reads in some samples, to make sure there is no anomaly (i.e. all reads having the same sequence).

# Look at per base sequence content
fseq <- seqTools::fastqq(FNs[4])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074570_1.fastq.gz'  done.
rc <- read_content(fseq)
plot_read_content(rc) + labs(title = "Per base sequence content")

plot_read_content(rc) + xlim(0,50) + labs(title = "Per base sequence content")

LOOK FOR PRIMERS

Now, we will look whether the reads still contain the primers.

# Import metadata, that contains the primer sequences
metadata <- read.csv(file.path(path, "Metadata-AGP.csv"))
table(metadata$pcr_primers..exp.)
## 
## FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 
##                                                45 
## FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT 
##                                              1245
# V4
FWD <- "GTGYCAGCMGCCGCGGTAA"  # 515F primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # 806R primer sequence

# Function that, from the primer sequence, will return all combinations possible (complement, reverse complement, ...)
allOrients <- function(primer) {
    # Create all orientations of the input sequence
    require(Biostrings)
    dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
    orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
        RevComp = reverseComplement(dna))
    return(sapply(orients, toString))  # Convert back to character vector
}

# Get all combinations of the primer sequences
FWD.orients <- allOrients(FWD) # 515F
REV.orients <- allOrients(REV) # 806R
FWD.orients # sanity check
##               Forward            Complement               Reverse 
## "GTGYCAGCMGCCGCGGTAA" "CACRGTCGKCGGCGCCATT" "AATGGCGCCGMCGACYGTG" 
##               RevComp 
## "TTACCGCGGCKGCTGRCAC"
REV.orients
##                Forward             Complement                Reverse 
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG" 
##                RevComp 
## "ATTAGAWACCCBDGTAGTCC"
# Function that counts number of reads in which a sequence is found
primerHits <- function(primer, fn) {
    # Counts number of reads in which the primer is found
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE, max.mismatch = 2)
    return(sum(nhits > 0))
}

# Get a table to know how many times the 515F and 806R primers are found in the reads of each sample
for (i in 1:4){ # 1:length(FNs)
  cat("SAMPLE", sample.names[i], "with total number of", raw_stats[i,'reads'], "reads\n\n")
  x <-  rbind(ForwardPrimer = sapply(FWD.orients, primerHits, fn = FNs[[i]]), 
              ReversePrimer = sapply(REV.orients, primerHits, fn = FNs[[i]]))
  print(x)
  cat("\n____________________________________________\n\n")
}
## SAMPLE ERR1073039 with total number of 17661 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       0          0       0       0
## ReversePrimer       0          0       0       4
## 
## ____________________________________________
## 
## SAMPLE ERR1073655 with total number of 53710 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       0          0       0       0
## ReversePrimer       0          0       0      33
## 
## ____________________________________________
## 
## SAMPLE ERR1073725 with total number of 26291 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       0          0       0       0
## ReversePrimer       0          0       0       8
## 
## ____________________________________________
## 
## SAMPLE ERR1074570 with total number of 27191 reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       5          0       0       0
## ReversePrimer       2          0       0    1600
## 
## ____________________________________________

FILTER AND TRIM

1. Primer removal

The reads do not contain any primer. We will go forward directly with the quality filtering.

2. Quality filtering

Then, we perform a quality filtering of the reads.

# Place filtered files in a filtered/ subdirectory
filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_filt.fastq.gz"))
# Assign names for the filtered fastq.gz files
names(filt2_samples) <- sample.names

out2 <- filterAndTrim(fwd = FNs, filt = filt2_samples,
                      maxEE=3, # reads with more than 3 expected errors (sum(10e(-Q/10))) are discarded
                      truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
                      minLen=150, # Discard reads shorter than 150 bp.
                      multithread=TRUE,
                      compress=TRUE,
                      verbose=TRUE)


# Update list of filtered files
FNs_filt <- sort(list.files(file.path(path, "filtered2"), pattern=".fastq.gz", full.names = TRUE))
sample.names_filt <- sapply(strsplit(basename(FNs_filt), "_"), `[`, 1)
names(FNs_filt) <- sample.names_filt

About ~35 samples had a large number of reads but none of them passed the quality filter. Let’s inspect quickly the quality profile of some of them.

for (i in 195:234){ # printing 2 samples before & after the "abnormal samples", that actually passed the quality filter (as comparison)
  show(plotQualityProfile(FNs[i]))
}

The quality profile of these discarded samples show some abnormalities, and reads were also shorter than 150bp. It is not surprising none of the reads passed the quality filtering.

LEARN ERROR RATES

Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.

err <- learnErrors(FNs_filt, multithread=TRUE, randomize=TRUE, verbose = 1)

The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.

plotErrors(err, nominalQ = TRUE)

CONSTRUCT SEQUENCE TABLE

1. Infer sample composition

The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.

# Prepare empty vector for the infered sequences
seq_infered <- vector("list", length(sample.names_filt))
names(seq_infered) <- sample.names_filt # names of each row will be the sample names

# Iterate through the 1242 samples
for(sampl in sample.names_filt) {
  cat("Processing:", sampl, "\n") # print which sample is being processed
  derep <- derepFastq(FNs_filt[[sampl]]) # dereplicate the reads in the sample
  seq_infered[[sampl]] <- dada(derep, err=err, multithread=TRUE) # default parameters
}
# Inspect the infered sequence variants from sample 1:5
for (i in 1:5){
  print(seq_infered[[i]])
  print("________________")
}
## dada-class: object describing DADA2 denoising results
## 169 sequence variants were inferred from 3842 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 131 sequence variants were inferred from 4890 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 281 sequence variants were inferred from 6215 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 214 sequence variants were inferred from 6000 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 325 sequence variants were inferred from 6087 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"

2. Construct ASV table

We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.

# Make sequence table from the infered sequence variants
seqtable <- makeSequenceTable(seq_infered)

# We should have 1242 samples (1242 rows)
dim(seqtable)
## [1]  1242 28348
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

3. Remove chimeras

The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.

seqtable.nochim <- removeBimeraDenovo(seqtable, method="consensus", multithread=TRUE, verbose=TRUE)

# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1]  1242 18673
# Keep only ASVs that are present in at least 2 samples
seqtable.nochim <- seqtable.nochim[, colSums(seqtable.nochim > 0) >=2]
dim(seqtable.nochim)
## [1] 1242 6512
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable)
## [1] 0.9410504

LOOK AT READS COUNT THROUGH PIPELINE

Sanity check before assigning taxonomy.

# Function that counts nb of reads
getN <- function(x) sum(getUniques(x))

# Table that will count number of reads for each process of interest (input reads, filtered reads, denoised reads, non chimera reads)
rownames(out2) <- sapply(strsplit(rownames(out2), "_"), `[`, 1)
track <- out2 %>%
  as.data.frame() %>%
  mutate(Run=rownames(out2)) %>%
  rename("input"="reads.in",
         "quality-filt"="reads.out") %>%
  left_join(sapply(seq_infered, getN) %>%
              as.data.frame %>% 
              rename(denoised=".") %>%
              mutate(Run=names(sapply(seq_infered, getN))),
            by="Run") %>%
  left_join(rowSums(seqtable.nochim) %>%
              as.data.frame %>%
              rename(nonchim=".") %>%
              mutate(Run=rownames(seqtable.nochim)),
            by="Run") %>%
  relocate(Run) %>%
  mutate("%input->output"=as.integer(nonchim*100/input))

# Show final table: for each row/sample, we have shown the initial number of reads, filtered reads, denoised reads, and non chimera reads
track

ASSIGN TAXONOMY

Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.

# Assign taxonomy (with silva v138)
taxa <- assignTaxonomy(seqtable.nochim, "~/Projects/IBS_Meta-analysis_16S/data/silva_nr_v138_train_set.fa.gz",
                       tryRC = TRUE, # try reverse complement of the sequences
                       multithread=TRUE, verbose = TRUE)

# Add species assignment
taxa <- addSpecies(taxa, "~/Projects/IBS_Meta-analysis_16S/data/silva_species_assignment_v138.fa.gz")
# Check how the taxonomy table looks like
taxa.print <- taxa
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
##      Kingdom    Phylum           Class                 Order             
## [1,] "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Enterobacterales"
## [2,] "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Enterobacterales"
## [3,] "Bacteria" "Bacteroidota"   "Bacteroidia"         "Bacteroidales"   
## [4,] "Bacteria" "Bacteroidota"   "Bacteroidia"         "Bacteroidales"   
## [5,] "Bacteria" "Bacteroidota"   "Bacteroidia"         "Bacteroidales"   
## [6,] "Bacteria" "Firmicutes"     "Clostridia"          "Oscillospirales" 
##      Family               Genus                  Species    
## [1,] "Enterobacteriaceae" "Escherichia/Shigella" NA         
## [2,] "Enterobacteriaceae" "Escherichia/Shigella" NA         
## [3,] "Bacteroidaceae"     "Bacteroides"          "vulgatus" 
## [4,] "Bacteroidaceae"     "Bacteroides"          "vulgatus" 
## [5,] "Bacteroidaceae"     "Bacteroides"          "uniformis"
## [6,] "Ruminococcaceae"    "Faecalibacterium"     NA
table(taxa.print[,1]) # Show the different kingdoms
## 
##   Archaea  Bacteria Eukaryota 
##        13      6458        36
table(taxa.print[,2]) # Show the different phyla
## 
##   Acidobacteriota  Actinobacteriota      Bacteroidota     Basidiomycota 
##                 4               276               919                 5 
##  Campilobacterota     Crenarchaeota     Cyanobacteria      Deinococcota 
##                11                 1                65                 7 
##  Desulfobacterota   Elusimicrobiota        Euglenozoa     Euryarchaeota 
##                62                 2                 1                 5 
##        Firmicutes    Fusobacteriota    Incertae_Sedis      Mucoromycota 
##              4544                27                 5                 2 
##   Patescibacteria    Proteobacteria     Spirochaetota      Synergistota 
##                 4               429                 1                14 
##  Thermoplasmatota Verrucomicrobiota 
##                 7                54
table(is.na(taxa.print[,2])) # is there any NA phyla?
## 
## FALSE  TRUE 
##  6445    67

LAST PREPROCESSING

We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.

1. Create phyloseq object

The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.

#_________________________
# Import metadata
metadata <- read.csv(file.path(path, "Metadata-AGP.csv"))

# Keep relevant covariates
sampledf <- metadata %>%
  select(-"pcr_primers..exp.")
rownames(sampledf) <- sampledf$Run

sampledf$author <- "AGP"
sampledf$sequencing_tech <- "Illumina"
sampledf$variable_region <- "V4"

#_________________________
# Create phyloseq object
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
                   sample_data(sampledf), 
                   tax_table(taxa))

# Remove taxa that are eukaryota, or have unassigned Phyla
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)

2. Save to disk

# Save to disk
saveRDS(raw_stats, file.path(path, "01_Dada2-AGP/raw_stats.rds"))
saveRDS(out2, file.path(path, "01_Dada2-AGP/out2.rds"))
saveRDS(err, file.path(path, "01_Dada2-AGP/err.rds"))
saveRDS(seq_infered, file.path(path, "01_Dada2-AGP/seq_infered.rds"))


saveRDS(otu_table(physeq), file.path(path, "01_Dada2-AGP/ASVtable_final.rds")) # ASV table
saveRDS(tax_table(physeq), file.path(path, "01_Dada2-AGP/taxa_final.rds")) # taxa table
saveRDS(sample_data(physeq), file.path(path, "01_Dada2-AGP/metadata_final.rds")) # metadata table

# Taxa & Phyloseq object
saveRDS(taxa, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/Taxonomy/output/taxa_agp.rds")
saveRDS(physeq, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/PhyloTree/input/physeq_agp.rds")

3. Quick peek at data analysis

# Absolute abundance
# plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free") + theme(axis.text.x = element_blank()) # long to compute

# Relative abundance for Phylum
phylum.table <- physeq %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance")